import pandas as pd
import textacy
import spacy
from collections import Counter

# spaCy pipeline, loaded once and reused for every response in every domain.
nlp = spacy.load("en_core_web_sm")

DOMAINS = ['defund', 'deport']
CONDITIONS = ['excuse', 'noexcuse']
NGRAM_SIZES = (1, 2, 3)
INPUT_TEMPLATE = 'data/raw/{}-receiver-deidentified.dta'
OUTPUT_TEMPLATE = 'data/working/{}-receiver-wordcounts.dta'


def _ngram_counts(docs):
    """Count the text of every n-gram textacy extracts from *docs*.

    N-grams of the sizes in ``NGRAM_SIZES`` are extracted with textacy's
    default filtering options (no filter arguments are passed).

    Args:
        docs: Iterable of spaCy ``Doc`` objects.

    Returns:
        A ``Counter`` mapping n-gram text to its number of occurrences, in
        order of first appearance.
    """
    counts = Counter()
    for doc in docs:
        counts.update(
            span.text
            for span in textacy.extract.basics.ngrams(doc, NGRAM_SIZES)
        )
    return counts


def _counts_frame(counts, condition):
    """Turn n-gram counts into a ``word`` / ``count`` / ``condition`` frame.

    Args:
        counts: Mapping of n-gram text to occurrence count.
        condition: Label stored in every row of the ``condition`` column.

    Returns:
        A DataFrame with columns ``word``, ``count`` and ``condition``; it
        has zero rows (but still all three columns) when *counts* is empty.
    """
    # Naming the columns up front keeps the schema stable even when there
    # are no n-grams at all for this condition.
    frame = pd.DataFrame(list(counts.items()), columns=['word', 'count'])
    frame['condition'] = condition
    return frame


for domain in DOMAINS:
    data = pd.read_stata(INPUT_TEMPLATE.format(domain))
    # Work on an explicit copy so the column assignments below never write
    # into a view of the frame returned by read_stata.
    data = data[["condition", "openended"]].copy()
    # Non-string cells (e.g. missing responses) are treated as empty text
    # instead of raising inside str.lower.
    data["openended"] = data["openended"].apply(
        lambda text: text.lower() if isinstance(text, str) else ""
    )
    data["nlp"] = data["openended"].apply(nlp)

    # One frame per experimental condition. The condition label comes from
    # the filter value itself, so a condition with no rows yields an empty
    # frame rather than an IndexError.
    frames = [
        _counts_frame(
            _ngram_counts(data.loc[data["condition"] == condition, "nlp"]),
            condition,
        )
        for condition in CONDITIONS
    ]

    # NOTE(review): the output is CSV text even though the path ends in
    # ".dta". The path is left unchanged because downstream readers may
    # depend on it -- confirm before renaming to ".csv".
    pd.concat(frames).to_csv(OUTPUT_TEMPLATE.format(domain), index=False)
